# 160510 - script to BLAST OTUs against BOLD - Anna's script, fixed
# NOTE(review): the working directory is hard-coded to one user's folder
# layout; every relative path used further down (the FASTA file and the
# results folder) is resolved against it, so adjust this line before running
# the script anywhere else.
setwd("~/Documents/UNI_und_VORLESUNGEN/11 phd projects/1 Meta SCHMALNAU/2 HiSeq biomass 160707/8 taxonomy")

# One-time setup if the package is not installed yet:
#install.packages("bold")
library("bold")    # bold_identify() and bold_specimens() are called below
library("seqinr")  # comp() is used below to complement a sequence
library("XML")     # htmlParse() and readHTMLTable() are used for the BIN pages

# ---- Input and output locations ----
# `fasta` and `FOLDER` are indexed with `mm`: entry `mm` of `fasta` is the
# FASTA file that is read, entry `mm` of `FOLDER` is the folder that receives
# the per-OTU result files written further down.
fasta <- c("../7 OTUs/D) OTU_KEEP.txt")
FOLDER <- c("1 BOLD_data")

mm <- 1

# ---- Read the FASTA file into a table with one row per sequence ----
data <- readLines(fasta[mm])

# Header lines start with ">". The pattern is anchored so that a ">" occurring
# elsewhere in a line can never be mistaken for the start of a record.
ID <- grep("^>", data)

# Last line of each record: the line before the next header, or the end of
# the file for the final record.
right <- c(ID[-1] - 1, length(data))

# Join the (possibly wrapped) sequence lines of every record into one string.
# A header that is not followed by any sequence line yields "" instead of
# counting backwards and pasting header lines into the sequence.
# seq_along() keeps this safe when the file contains no headers at all.
L <- vapply(seq_along(ID), function(i) {
  if (right[i] <= ID[i]) {
    return("")
  }
  paste(data[(ID[i] + 1):right[i]], collapse = "")
}, character(1))

# Columns: "ID" = full header line (including ">"), "sequ" = the sequence.
data <- data.frame(ID = data[ID], sequ = L, stringsAsFactors = FALSE)
# use APIs to BLAST sequences against BOLD etc...



# ---- Query BOLD for every OTU, in both orientations ----
# Each sequence and its reverse complement are sent to bold_identify() with
# the COX1 database. All hits of one OTU are written to
# "<FOLDER[mm]>/<OTU name>.csv" (tab-separated despite the extension).
for (i in seq_len(nrow(data))) {

  fw_seq <- data$sequ[i]

  # Reverse complement, built base by base with comp() from seqinr.
  revcomp <- paste(rev(comp(strsplit(fw_seq, "")[[1]])), collapse = "")

  # bold_identify() returns a list; its first element is the hit table for the
  # single query sent here, or NULL when nothing matched. The table is
  # extracted for BOTH orientations so that two plain tables are combined
  # below (never the list wrapper, and never a single column of a table).
  fw_hits <- bold_identify(fw_seq, db = "COX1")[[1]]
  rc_hits <- bold_identify(revcomp, db = "COX1")[[1]]

  # rbind() skips NULL arguments: the result is whichever table exists, both
  # stacked (reverse-complement hits first), or NULL when neither strand hit.
  hittable <- rbind(rc_hits, fw_hits)

  # write bold results in file!
  # The file name is the OTU name taken from the FASTA header, i.e. the part
  # between ">" and ";size". A NULL hit table still produces a file, so OTUs
  # without any hit remain visible in the results folder.
  name <- gsub(">(OTU_.*);size.*", "\\1", data$ID[i])
  write.table(hittable, file = paste0(FOLDER[mm], "/", name, ".csv"), sep = "\t")

  # Progress report: a bare value is not auto-printed inside a loop, so it is
  # emitted explicitly.
  message("BOLD query ", i, " of ", nrow(data), " done: ", name)
}



##############################
# read in the hit tables and pull the taxonomic information for each hit from BOLD!
# ---- Add BIN-based taxonomy to every hit table ----
# For each "<OTU>.csv" hit table: fetch the specimen records of all hits, look
# up the public BIN page of every BIN found, and write the hits together with
# Order/Family/Subfamily/Genus/Species to "<OTU>_hacked.txt".
otu_files <- list.files(FOLDER[mm], full.names = TRUE, pattern = "\\.csv$")

bin_page_url <- "http://www.boldsystems.org/index.php/Public_BarcodeCluster?clusteruri="
rank_columns <- c("Order", "Family", "Subfamily", "Genus", "Species")

# Download the public page of one BIN and return all HTML tables found on it.
read_bin_tables <- function(bin) {
  page <- htmlParse(paste0(bin_page_url, bin), encoding = "UCS-2LE")
  readHTMLTable(page, stringsAsFactors = FALSE)
}

# Return the value listed for `rank` (e.g. "Order") in a BIN taxonomy table,
# with the trailing " [...]" annotation removed; NA when the rank is absent.
bin_rank <- function(tax_table, rank) {
  value <- tax_table$V3[which(tax_table$V2 == paste0(rank, ":"))]
  if (length(value) == 0) {
    return(NA_character_)
  }
  sub(" \\[.*\\]", "", value[1])
}

for (s in seq_along(otu_files)) {

  out_file <- paste0(sub("\\.csv$", "", otu_files[s]), "_hacked.txt")

  # Resume support: a result file is only written once an OTU is finished, so
  # after a crash the script can simply be re-run and completed OTUs are
  # skipped. No start index has to be edited by hand. Delete a result file to
  # force it to be rebuilt.
  if (file.exists(out_file)) {
    next
  }

  # An OTU without any BOLD hit was saved as an empty table, whose first (and
  # only) line is a pair of double quotes. Only that line needs to be read.
  first_line <- readLines(otu_files[s], n = 1)
  if (length(first_line) == 0 || first_line == "\"\"") {
    next
  }

  tab <- read.table(otu_files[s], stringsAsFactors = FALSE)

  bins <- bold_specimens(ids = tab$ID) # get bin uri
  tab_bins <- merge(tab, bins, by.x = "ID", by.y = "processid")
  tab_bins <- tab_bins[order(tab_bins$similarity, decreasing = TRUE), ]

  IDtab <- data.frame(
    "ID" = tab_bins$ID,
    "bin_uri" = tab_bins$bin_uri,
    "similarity" = tab_bins$similarity,
    "country" = tab_bins$specimen_country,
    "orig_tax" = tab_bins$taxonomicidentification,
    "Order" = NA, "Family" = NA, "Subfamily" = NA, "Genus" = NA, "Species" = NA,
    stringsAsFactors = FALSE
  )

  # Distinct BINs among the hits, without missing values: NA, "" and
  # whitespace-only entries would otherwise be looked up as if they were BINs.
  unique_bins <- unique(IDtab$bin_uri)
  unique_bins <- unique_bins[!is.na(unique_bins) & nzchar(trimws(unique_bins))]

  # No BIN at all: nothing to look up and no result file is written.
  if (length(unique_bins) == 0) {
    next
  }

  for (bin in unique_bins) {

    tables <- read_bin_tables(bin)

    # A page with exactly two tables is treated as a BIN that was merged into
    # another one: the raw page is searched for the "has been synonymized"
    # notice and the BIN linked there is fetched instead.
    if (length(tables) == 2) {
      raw_page <- readLines(paste0(bin_page_url, bin), encoding = "UCS-2LE", warn = FALSE)
      new_bin <- sub(".*clusteruri=(.*)\".*", "\\1",
                     raw_page[grep("has been synonymized", raw_page)])
      if (length(new_bin) == 0) {
        warning("No replacement BIN found for ", bin, call. = FALSE)
        next
      }
      tables <- read_bin_tables(new_bin[1])
    }

    # The taxonomy is read from the 16th table of the page. Pages that do not
    # have it (including a page that still has only two tables) are skipped
    # with a warning instead of stopping the whole run.
    if (length(tables) < 16) {
      warning("Unexpected BIN page layout for ", bin, " - taxonomy left as NA",
              call. = FALSE)
      next
    }
    tax_table <- tables[[16]]

    # which() drops NA comparisons, which a data-frame assignment would reject.
    rows <- which(IDtab$bin_uri == bin)
    for (rank in rank_columns) {
      IDtab[rows, rank] <- bin_rank(tax_table, rank)
    }
  }

  write.table(IDtab, out_file, sep = "\t")
}



#make awesome plot!

# ---- Draw one overview PDF per OTU ----
# Every "<OTU>_hacked.txt" table becomes "<FOLDER[mm]>/<OTU>_plot.pdf" with
# three parts: a bar chart of the hit similarities (top right), the taxonomy of
# the best hit in large type (top left), and the full hit table underneath,
# with each row shaded by BIN.
plottables <- list.files(FOLDER[mm], full.names = TRUE, pattern = "_hacked\\.txt$")

# Colours handed out to the BINs in order of first appearance (recycled when
# there are more BINs than colours).
bin_palette <- c("skyblue1", "dodgerblue", "khaki2", "lightcoral", "lightgreen",
                 "mediumorchid1", "brown1", "turquoise1", "lightgray", "yellow",
                 "hotpink", "forestgreen", "green", "orange")

# x position of every text column of the hit table.
text_columns <- c(ID = 0, bin_uri = 8, orig_tax = 15, country = 25, Order = 34,
                  Family = 44, Subfamily = 55, Genus = 68, Species = 81)

for (f in seq_along(plottables)) {

  plotdata <- read.table(plottables[f], stringsAsFactors = FALSE)
  if (nrow(plotdata) == 0) {
    warning("No hits in ", plottables[f], " - no plot drawn", call. = FALSE)
    next
  }

  # Row 1 becomes the column header, row 2 a blank spacer; the hits follow
  # from row 3 on. This turns every column into character.
  plotdata <- rbind(names(plotdata), rep("", ncol(plotdata)), plotdata)
  n <- nrow(plotdata)
  hit_rows <- 3:n

  OTU <- sub(".+(OTU_.*)_hacked.*", "\\1", plottables[f])

  pdf(paste0(FOLDER[mm], "/", OTU, "_plot.pdf"), width = 11, height = 15)

  # The device is closed in `finally`, so a failing plot call cannot leave a
  # half-written PDF device open for the following OTUs.
  tryCatch({
    par(mar = c(0, 0, 0, 0))
    plot(NULL, xlim = c(0, 100), ylim = c(0, 125), xlab = "", ylab = "",
         xaxt = "n", yaxt = "n", bty = "n")

    # One colour per hit, shared by all hits of the same BIN.
    hit_bins <- plotdata$bin_uri[hit_rows]
    bin_levels <- unique(hit_bins)
    heycol <- rep_len(bin_palette, length(bin_levels))[match(hit_bins, bin_levels)]

    similarity <- as.numeric(plotdata$similarity[hit_rows])

    # plot similarity
    # One bar per hit, 0.5 units apart starting at x = 50. Bar height is the
    # similarity above 0.8 (anything lower is drawn with height 0).
    identSeq <- seq(50, 100, by = 0.5)
    bar_x <- identSeq[seq_along(similarity)]
    bar_height <- pmax(similarity - 0.8, 0) * 100

    rect(49, 104, 101, 126)
    rect(bar_x - 0.25, 105, bar_x + 0.25, 105 + bar_height, border = NA, col = heycol)

    # bin background!
    # Table row k is centred on y = 100 - k.
    rect(0, 99.5 - hit_rows, 96, 100.5 - hit_rows, col = heycol, border = NA)

    # texts
    all_rows <- seq_len(n)
    for (column in names(text_columns)) {
      text(text_columns[[column]], 100 - all_rows, plotdata[[column]], adj = 0, cex = 0.5)
    }

    # Similarity column: the cell colour encodes the match quality. Later
    # assignments override earlier ones; a missing similarity stays uncoloured.
    simcol <- rep(NA_character_, length(similarity))
    simcol[which(similarity < 0.95)] <- "Red"
    simcol[which(similarity >= 0.95)] <- "Orange"
    simcol[which(similarity >= 0.98)] <- "Yellow"
    simcol[which(similarity >= 1)] <- "Green"

    rect(97, 99.5 - hit_rows, 100, 100.5 - hit_rows, col = simcol, border = NA)

    text(97, 100 - hit_rows, similarity * 100, adj = 0, cex = 0.5)
    text(97, 100 - 1, plotdata$similarity[1], adj = 0, cex = 0.5)

    # Headline: OTU name, then taxonomy and similarity (in percent) of the
    # first hit in the table (row 3).
    text(0, 123, OTU, adj = 0, cex = 2)

    text(0, 119, plotdata$Species[3], adj = 0, cex = 2)
    text(0, 116, plotdata$Genus[3], adj = 0, cex = 2)
    text(0, 113, plotdata$Family[3], adj = 0, cex = 2)
    text(0, 110, plotdata$Order[3], adj = 0, cex = 2)

    text(0, 105, similarity[1] * 100, adj = 0, cex = 2)
  }, finally = dev.off())
}

